# import bibliotek oraz ustawienia
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import pycountry_convert as pc
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
plt.style.use('fivethirtyeight')
food = pd.read_csv("fullcifocoss.csv", on_bad_lines='skip', sep=';', skipinitialspace = True) # wczytanie danych do dataframe pozbywając sie rzędów, które mają za dużo pól oraz spacji
pd.set_option('display.max_columns', None) # sprawiam że można przejrzeć wszystkie columny ponieważ by deafault jest ograniczona ilość wyświetlanych kolumn
food.head()
| BW | Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Consumers_P05 | Consumers_P90 | Consumers_P95 | Consumers_P975 | Consumers_Standard_deviation | Number_of_subjects | Total_Mean | Total_Median | Total_P05 | Total_P90 | Total_P95 | Total_P975 | Total_Standard_deviation | ExtBW | ExtBWValue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 60.6230 | 0.0 | 8.3333 | 116.6667 | 150.0000 | 166.6667 | NaN | 66172 | 1.0600 | 0.0 | 0.0 | NaN | 0.0 | NaN | NaN | NaN | NaN |
| 1 | 0 | China | 2002 | A000G | Oat grain | All | All | female | 608 | 55.8676 | 0.0 | 8.3333 | 116.6667 | 133.3333 | 158.3333 | NaN | 33953 | 1.0004 | 0.0 | 0.0 | NaN | 0.0 | NaN | NaN | NaN | NaN |
| 2 | 0 | China | 2002 | A000G | Oat grain | All | All | male | 549 | 65.8895 | 0.0 | 10.0000 | 133.3333 | 158.3333 | 166.6667 | NaN | 32219 | 1.1227 | 0.0 | 0.0 | NaN | 0.0 | NaN | NaN | NaN | NaN |
| 3 | 0 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 55.2645 | 0.0 | 8.3333 | 100.0000 | 116.6667 | 158.3333 | NaN | 66172 | 0.1395 | 0.0 | 0.0 | NaN | 0.0 | NaN | NaN | NaN | NaN |
| 4 | 0 | China | 2002 | A000N | Buckwheat | All | All | female | 82 | 54.7053 | 0.0 | 8.3333 | 83.3333 | 116.6667 | 183.3333 | NaN | 33953 | 0.1321 | 0.0 | 0.0 | NaN | 0.0 | NaN | NaN | NaN | NaN |
food.drop(columns=food.loc[:, 'Consumers_P05':'Consumers_Standard_deviation'], inplace=True)
food.drop(columns=food.loc[:, 'Total_P05':'ExtBWValue'], inplace=True)
food = food.drop(['BW'], axis=1)
food.head(10)
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 60.6230 | 0.0 | 66172 | 1.0600 | 0.0 |
| 1 | China | 2002 | A000G | Oat grain | All | All | female | 608 | 55.8676 | 0.0 | 33953 | 1.0004 | 0.0 |
| 2 | China | 2002 | A000G | Oat grain | All | All | male | 549 | 65.8895 | 0.0 | 32219 | 1.1227 | 0.0 |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 55.2645 | 0.0 | 66172 | 0.1395 | 0.0 |
| 4 | China | 2002 | A000N | Buckwheat | All | All | female | 82 | 54.7053 | 0.0 | 33953 | 0.1321 | 0.0 |
| 5 | China | 2002 | A000N | Buckwheat | All | All | male | 85 | 55.8039 | 0.0 | 32219 | 0.1472 | 0.0 |
| 6 | China | 2002 | A000P | Barley grains | All | All | All | 61 | 38.5792 | 0.0 | 66172 | NaN | 0.0 |
| 7 | China | 2002 | A000P | Barley grains | All | All | female | 28 | 37.5000 | 0.0 | 33953 | NaN | 0.0 |
| 8 | China | 2002 | A000P | Barley grains | All | All | male | 33 | 39.4949 | 0.0 | 32219 | NaN | 0.0 |
| 9 | China | 2002 | A000T | Maize grain | All | All | All | 2422 | 86.9653 | 0.0 | 66172 | 3.1831 | 0.0 |
food.shape
(544686, 13)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
food.head()
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 60.62 | 0.00 | 66172 | 1.06 | 0.00 |
| 1 | China | 2002 | A000G | Oat grain | All | All | female | 608 | 55.87 | 0.00 | 33953 | 1.00 | 0.00 |
| 2 | China | 2002 | A000G | Oat grain | All | All | male | 549 | 65.89 | 0.00 | 32219 | 1.12 | 0.00 |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 55.26 | 0.00 | 66172 | 0.14 | 0.00 |
| 4 | China | 2002 | A000N | Buckwheat | All | All | female | 82 | 54.71 | 0.00 | 33953 | 0.13 | 0.00 |
food.shape
(544686, 13)
food = food.drop_duplicates()
food.shape
(523276, 13)
544686-523276
21410
msno.bar(food)
<Axes: >
food.isna().sum()
Country 0 Year 0 FoodCode 20 FoodName 624 AgeClass 0 SourceAgeClass 0 Gender 0 Number_of_consumers 0 Consumers_Mean 32 Consumers_Median 4532 Number_of_subjects 0 Total_Mean 5102 Total_Median 3 dtype: int64
null_data = food[food.isnull().any(axis=1)]
null_data.tail(5) # NaN w FoodCode widać dopiero na 50 ale dla wygody pozostaje default
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 540608 | Republic of Korea | 2015 | fa6adbfab52e8a77f23df411f59c2150 | NaN | Children and Adolescents | 3-5 years | All | 692 | 2.26 | 0.84 | 804 | 1.95 | 0.59 |
| 540609 | Republic of Korea | 2015 | fa6adbfab52e8a77f23df411f59c2150 | NaN | Children and Adolescents | 6-14 years | All | 1937 | 1.89 | 0.81 | 2376 | 1.54 | 0.48 |
| 540610 | Republic of Korea | 2015 | fa6adbfab52e8a77f23df411f59c2150 | NaN | Adults and Elderly | 15-49 years | All | 6798 | 1.80 | 0.76 | 8253 | 1.48 | 0.48 |
| 540611 | Republic of Korea | 2015 | fa6adbfab52e8a77f23df411f59c2150 | NaN | Adults and Elderly | 50-74 years | All | 6054 | 2.04 | 0.76 | 7069 | 1.74 | 0.49 |
| 540612 | Republic of Korea | 2015 | fa6adbfab52e8a77f23df411f59c2150 | NaN | Adults and Elderly | >75 years | All | 1204 | 1.97 | 0.40 | 1650 | 1.44 | 0.11 |
food.loc[527540]
Country Democratic Republic of the Congo Year 2016 FoodCode NaN FoodName NaN AgeClass All SourceAgeClass All Gender All Number_of_consumers 1 Consumers_Mean 3.36 Consumers_Median 3.36 Number_of_subjects 214 Total_Mean 0.02 Total_Median 0.00 Name: 527540, dtype: object
food = food.dropna(subset=['FoodName'])
food.isna().sum()
Country 0 Year 0 FoodCode 0 FoodName 0 AgeClass 0 SourceAgeClass 0 Gender 0 Number_of_consumers 0 Consumers_Mean 32 Consumers_Median 4532 Number_of_subjects 0 Total_Mean 5102 Total_Median 3 dtype: int64
food.loc[food['FoodCode'] == "fa6adbfab52e8a77f23df411f59c2150"]
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median |
|---|
temp = food['FoodCode'].str.len() > 5
temp.value_counts()
FoodCode False 516640 True 6012 Name: count, dtype: int64
checkpoint = food #tworzę checkpoint żeby móc łatwo wrócić do wersji przed sortowaniem
food['CodeLen'] = food['FoodCode'].str.len()
food.sort_values(by=['CodeLen'])
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median | CodeLen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 60.62 | 0.00 | 66172 | 1.06 | 0.00 | 5 |
| 348298 | Portugal | 2015 | A00DH | Oat rolled grains | Children and Adolescents | Other children | Female | 1 | 0.40 | 0.40 | 262 | 0.00 | 0.00 | 5 |
| 348297 | Portugal | 2015 | A006R | Traditional unleavened breads | Children and Adolescents | Other children | Female | 1 | 1.94 | 1.94 | 262 | 0.01 | 0.00 | 5 |
| 348296 | Portugal | 2015 | A006P | Rusk, wholemeal | Children and Adolescents | Other children | Male | 1 | 1.63 | 1.63 | 259 | 0.01 | 0.00 | 5 |
| 348295 | Portugal | 2015 | A006P | Rusk, wholemeal | Children and Adolescents | Other children | Female | 4 | 0.65 | 0.59 | 262 | 0.01 | 0.00 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 251182 | Nigeria | 2011 | 17348df7c5044de2950b5bce2d8bb912 | (All Meat and meat products) | Children and Adolescents | 6-14 years | female | 0 | 0.00 | 0.00 | 1 | 0.00 | 0.00 | 32 |
| 275702 | United States of America | 2010 | 356d09ef45eb5879c5a334a1b9441094 | (All Ingredients) | Adults and Elderly | >75 years | All | 1429 | 0.10 | 0.08 | 1595 | 0.09 | 0.07 | 32 |
| 275701 | United States of America | 2010 | 356d09ef45eb5879c5a334a1b9441094 | (All Ingredients) | Adults and Elderly | 50-74 years | All | 4496 | 0.11 | 0.08 | 5215 | 0.10 | 0.06 | 32 |
| 275699 | United States of America | 2010 | 356d09ef45eb5879c5a334a1b9441094 | (All Ingredients) | Children and Adolescents | 6-14 years | All | 3215 | 0.07 | 0.04 | 4330 | 0.05 | 0.02 | 32 |
| 217932 | Italy | 2006 | eb31155c944a8c91ec4a53503cf50264 | (All Nuts) | Adults and Elderly | 15-49 years | female | 56 | 14.83 | 11.20 | 873 | 0.95 | 0.00 | 32 |
522652 rows × 14 columns
temp = food.loc[food['FoodCode'] == "a93a0316b93a7c2af9305e90012af119"]
len(temp)
458
food = checkpoint # powrót do checkpointa
food_all = food.loc[food['Gender'] == "All"]
food_all_oat = food_all.loc[food_all['FoodName'] == "Oat grain"]
food_all_oat_ch = food_all_oat.loc[food_all_oat['Country'] == "China"]
food_all_oat_ch.head(20)
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Consumers_Mean | Consumers_Median | Number_of_subjects | Total_Mean | Total_Median | CodeLen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 60.62 | 0.00 | 66172 | 1.06 | 0.00 | 5 |
| 812 | China | 2002 | A000G | Oat grain | Infants and Toddlers | 0-35 months | All | 10 | 28.58 | 0.00 | 838 | 0.34 | 0.00 | 5 |
| 1251 | China | 2002 | A000G | Oat grain | Children and Adolescents | 3-5 years | All | 20 | 22.00 | 0.00 | 2235 | 0.20 | 0.00 | 5 |
| 1782 | China | 2002 | A000G | Oat grain | Children and Adolescents | 6-14 years | All | 107 | 39.97 | 0.00 | 9844 | 0.43 | 0.00 | 5 |
| 2487 | China | 2002 | A000G | Oat grain | Adults and Elderly | 15-49 years | All | 545 | 66.67 | 0.00 | 33719 | 1.08 | 0.00 | 5 |
| 3267 | China | 2002 | A000G | Oat grain | Adults and Elderly | 50-74 years | All | 438 | 62.06 | 0.00 | 18143 | 1.50 | 0.00 | 5 |
| 4020 | China | 2002 | A000G | Oat grain | Adults and Elderly | >75 years | All | 37 | 43.87 | 0.00 | 1393 | 1.17 | 0.00 | 5 |
| 279440 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 1.12 | NaN | 66172 | NaN | 0.00 | 5 |
| 280252 | China | 2002 | A000G | Oat grain | Infants and Toddlers | 0-35 months | All | 10 | 2.00 | NaN | 838 | NaN | 0.00 | 5 |
| 280691 | China | 2002 | A000G | Oat grain | Children and Adolescents | 3-5 years | All | 20 | 1.31 | NaN | 2235 | NaN | 0.00 | 5 |
| 281222 | China | 2002 | A000G | Oat grain | Children and Adolescents | 6-14 years | All | 107 | 1.35 | NaN | 9844 | NaN | 0.00 | 5 |
| 281927 | China | 2002 | A000G | Oat grain | Adults and Elderly | 15-49 years | All | 545 | 1.12 | NaN | 33719 | NaN | 0.00 | 5 |
| 282707 | China | 2002 | A000G | Oat grain | Adults and Elderly | 50-74 years | All | 438 | 1.06 | NaN | 18143 | NaN | 0.00 | 5 |
| 283460 | China | 2002 | A000G | Oat grain | Adults and Elderly | >75 years | All | 37 | 0.79 | NaN | 1393 | NaN | 0.00 | 5 |
food_all_oat_ch['Consumers_Mean'].iloc[0]
60.623
1157/66172*100 # procent konsumentów z całej puli badanych
1.748473674666022
66172/1157 #liczba badanych podzielona przez liczbę konsumentów
57.19273984442524
(1.12+66172)/1157 # połączona liczba badanych podzielona przez liczbę konsumentów
57.193707865168534
food_all_oat_ch['Consumers_Mean'].iloc[1:7].sum() # suma średnich dla rzędów 1-6
263.15090000000004
food_all_oat_ch['Consumers_Mean'].iloc[1:7].mean() # średnia ze średnich
43.85848333333334
food_all_oat_ch['Number_of_consumers'].iloc[:7].mean() # średnia z pierwszyć 7 rzędów
330.57142857142856
food_all_oat_ch['Number_of_consumers'].iloc[0]/food_all['Number_of_consumers'].iloc[1:7].sum() # wartość dla all podzielona przez sume wartości rzędów 1-6
0.01670516892867456
food.shape
(522652, 14)
food = food.drop(['Consumers_Mean', 'Consumers_Median', 'Total_Mean', 'Total_Median', 'CodeLen'], axis=1)
food.shape
(522652, 9)
food
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 66172 |
| 1 | China | 2002 | A000G | Oat grain | All | All | female | 608 | 33953 |
| 2 | China | 2002 | A000G | Oat grain | All | All | male | 549 | 32219 |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 66172 |
| 4 | China | 2002 | A000N | Buckwheat | All | All | female | 82 | 33953 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 544675 | Republic of Korea | 2015 | A16MR | Juice concentrate, pineapple | Adults and Elderly | 15-49 years | male | 1 | 3518 |
| 544679 | Republic of Korea | 2015 | A16MR | Juice concentrate, pineapple | All | All | male | 1 | 9114 |
| 544680 | Republic of Korea | 2015 | A16MR | Juice concentrate, pineapple | All | All | female | 2 | 11557 |
| 544684 | Republic of Korea | 2015 | A16MR | Juice concentrate, pineapple | Adults and Elderly | 15-49 years | All | 1 | 8253 |
| 544685 | Republic of Korea | 2015 | A16MR | Juice concentrate, pineapple | Adults and Elderly | 50-74 years | All | 2 | 7069 |
522652 rows × 9 columns
food.shape
(522652, 9)
food = food.drop_duplicates()
food.shape
(272016, 9)
544032-272016
272016
food_all = food.loc[food['Gender'] == "All"]
food_all_oat = food_all.loc[food_all['FoodName'] == "Oat grain"]
food_all_oat_ch = food_all_oat.loc[food_all_oat['Country'] == "China"]
food_all_oat_ch.head(10)
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat grain | All | All | All | 1157 | 66172 |
| 812 | China | 2002 | A000G | Oat grain | Infants and Toddlers | 0-35 months | All | 10 | 838 |
| 1251 | China | 2002 | A000G | Oat grain | Children and Adolescents | 3-5 years | All | 20 | 2235 |
| 1782 | China | 2002 | A000G | Oat grain | Children and Adolescents | 6-14 years | All | 107 | 9844 |
| 2487 | China | 2002 | A000G | Oat grain | Adults and Elderly | 15-49 years | All | 545 | 33719 |
| 3267 | China | 2002 | A000G | Oat grain | Adults and Elderly | 50-74 years | All | 438 | 18143 |
| 4020 | China | 2002 | A000G | Oat grain | Adults and Elderly | >75 years | All | 37 | 1393 |
food_all_oat_ch['Number_of_consumers'].iloc[1:].sum()
1157
food_all_oat_ch['Number_of_consumers'].iloc[0]
1157
food['Gender'].unique()
array(['All', 'female', 'male', 'Female', 'Male'], dtype=object)
# zrobie tak bo tylko 3 kolumny zmieniam
food.loc[:, 'Gender'] = food.loc[:, 'Gender'].str.title()
food.loc[:, 'FoodName'] = food.loc[:, 'FoodName'].str.title()
food.loc[:, 'Country'] = food.loc[:, 'Country'].str.title()
food['Gender'].unique()
array(['All', 'Female', 'Male'], dtype=object)
food_all = food.loc[food['Gender'] == "All"]
food_fem = food.loc[food['Gender'] == "Female"]
food_men = food.loc[food['Gender'] == "Male"]
food_all_all = food_all.loc[food_all['AgeClass'] == "All"]
food_fem_all = food_fem.loc[food_fem['AgeClass'] == "All"]
food_men_all = food_men.loc[food_men['AgeClass'] == "All"]
food_all_all.head()
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat Grain | All | All | All | 1157 | 66172 |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 66172 |
| 6 | China | 2002 | A000P | Barley Grains | All | All | All | 61 | 66172 |
| 9 | China | 2002 | A000T | Maize Grain | All | All | All | 2422 | 66172 |
| 12 | China | 2002 | A001B | Common Millet Grain | All | All | All | 9069 | 66172 |
food_fem_all.head(5)
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | China | 2002 | A000G | Oat Grain | All | All | Female | 608 | 33953 |
| 4 | China | 2002 | A000N | Buckwheat | All | All | Female | 82 | 33953 |
| 7 | China | 2002 | A000P | Barley Grains | All | All | Female | 28 | 33953 |
| 10 | China | 2002 | A000T | Maize Grain | All | All | Female | 1355 | 33953 |
| 13 | China | 2002 | A001B | Common Millet Grain | All | All | Female | 4740 | 33953 |
food_all_all['Country'].unique()
array(['China', 'Republic Of Korea', 'United States Of America', 'Brazil',
'Italy', "Lao People'S Democratic Republic", 'Mexico',
'Mozambique', 'Malaysia', 'Nigeria', 'Pakistan', 'Philippines',
'Burkina Faso', 'Bangladesh', 'Romania', 'Bulgaria', 'Uganda',
'Bolivia (Plurinational State Of)', 'Zambia',
'Democratic Republic Of The Congo', 'Ethiopia', 'Guatemala',
'India'], dtype=object)
food_all_all['Country'].nunique()
23
food['Country'].unique()
array(['China', 'Republic Of Korea', 'Cyprus', 'Czech Republic',
'Denmark', 'Finland', 'France', 'Greece', 'Hungary', 'Ireland',
'Latvia', 'Netherlands', 'Portugal', 'Romania', 'Slovenia',
'Spain', 'Sweden', 'United Kingdom', 'Austria', 'Belgium',
'Bulgaria', 'Croatia', 'United States Of America', 'Brazil',
'Italy', "Lao People'S Democratic Republic", 'Mexico',
'Mozambique', 'Malaysia', 'Nigeria', 'Pakistan', 'Philippines',
'Burkina Faso', 'Bangladesh', 'Uganda',
'Bolivia (Plurinational State Of)', 'Zambia',
'Democratic Republic Of The Congo', 'Ethiopia', 'Guatemala',
'India'], dtype=object)
food['Country'].nunique()
41
food_fem_all['Country'].nunique()
23
food_men_all['Country'].nunique()
14
null_data = food[food.isnull().any(axis=1)]
null_data.tail()
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects |
|---|
food_fr = food.loc[food['Country'] == "France"]
food_fr.head(10)
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | |
|---|---|---|---|---|---|---|---|---|---|
| 37678 | France | 2007 | A03MQ | Shandy | Children and Adolescents | Other children | Male | 1 | 239 |
| 37679 | France | 2007 | A03MX | Wine, Red | Children and Adolescents | Other children | Female | 17 | 243 |
| 37680 | France | 2007 | A03MX | Wine, Red | Children and Adolescents | Other children | Male | 14 | 239 |
| 37681 | France | 2007 | A03MV | Wine, White | Children and Adolescents | Other children | Female | 19 | 243 |
| 37682 | France | 2007 | A03MV | Wine, White | Children and Adolescents | Other children | Male | 13 | 239 |
| 37683 | France | 2007 | A03ND | Cider | Children and Adolescents | Other children | Female | 3 | 243 |
| 37684 | France | 2007 | A037V | Pork Lard | Children and Adolescents | Other children | Female | 14 | 243 |
| 37685 | France | 2007 | A037V | Pork Lard | Children and Adolescents | Other children | Male | 9 | 239 |
| 37686 | France | 2007 | A038G | Duck Fat, Processed | Children and Adolescents | Other children | Male | 3 | 239 |
| 37687 | France | 2007 | A038H | Goose Fat, Processed | Children and Adolescents | Other children | Female | 14 | 243 |
food_fr['AgeClass'].unique()
array(['Children and Adolescents', 'Adults and Elderly',
'Infants and Toddlers'], dtype=object)
food_fr['Gender'].unique()
array(['Male', 'Female', 'All'], dtype=object)
food['AgeClass'].unique()
array(['All', 'Infants and Toddlers', 'Children and Adolescents',
'Adults and Elderly'], dtype=object)
food_fr = food_fr.loc[food_fr['Gender'] == "All"]
food_fr['Number_of_subjects'].unique().sum() # chyba to będzie rozwiązaniem
8539
food_ctry = food_all.groupby(['Country','Number_of_subjects']).sum().reset_index()
food_ctry = food_ctry.groupby(['Country']).sum().reset_index()
food_ctry.loc[food_ctry['Country'] == "France"]
| Country | Number_of_subjects | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | |
|---|---|---|---|---|---|---|---|---|---|
| 15 | France | 8539 | 15546648 | A036PA036VA037DA039CA03LGA03HGA03HHA032BA032CA... | Olive OilsRape Seed Oil, EdibleSunflower Seed ... | Infants and ToddlersInfants and ToddlersInfant... | InfantsInfantsInfantsInfantsInfantsInfantsInfa... | AllAllAllAllAllAllAllAllAllAllAllAllAllAllAllA... | 503329 |
food_ctry = food_ctry.sort_values(by='Number_of_subjects', ascending=False)
food_ctry['Country'].nunique()
41
most_sub_ctry = food_ctry[['Country','Number_of_subjects']]
most_sub_ctry.sum()
Country BrazilChinaUnited States Of AmericaRepublic Of... Number_of_subjects 523051 dtype: object
px.bar(most_sub_ctry, x="Country", y="Number_of_subjects", title="Number of subjects per country", labels={'Number_of_subjects':'Number of subjects', 'Country': 'Countries' })
top5 = most_sub_ctry['Number_of_subjects'].iloc[0:5].sum()
top5
388340
rest = most_sub_ctry['Number_of_subjects'].iloc[5:].sum()
rest
134711
top5/rest
2.882763842596373
def convert(row): # funkcja przypisująca kod kontynentu w zależności od kraju
cn_code = pc.country_name_to_country_alpha2(row.Country, cn_name_format="default")
conti_code = pc.country_alpha2_to_continent_code(cn_code)
return conti_code
continent = food
ctry_change = {
'Republic Of Korea' : 'South Korea',
'Bolivia (Plurinational State Of)' : 'Bolivia',
'United States Of America' : 'United States of America',
"Lao People'S Democratic Republic" : "Lao People's Democratic Republic",
"Democratic Republic Of The Congo" : "Democratic Republic of the Congo"
}
continent = continent.replace(ctry_change)
continent['ContinentCode'] = continent.apply(convert, axis=1)
continent
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | ContinentCode | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat Grain | All | All | All | 1157 | 66172 | AS |
| 1 | China | 2002 | A000G | Oat Grain | All | All | Female | 608 | 33953 | AS |
| 2 | China | 2002 | A000G | Oat Grain | All | All | Male | 549 | 32219 | AS |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 66172 | AS |
| 4 | China | 2002 | A000N | Buckwheat | All | All | Female | 82 | 33953 | AS |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 272338 | India | 2015 | A0EQN | Soft Drinks With Minor Amounts Of Fruits Or Fl... | All | All | Female | 8 | 242 | AS |
| 272339 | India | 2015 | A0EQN | Soft Drinks With Minor Amounts Of Fruits Or Fl... | Adults and Elderly | 15-49 years | All | 8 | 242 | AS |
| 272340 | India | 2015 | A0F4S | Coconut Water | All | All | All | 7 | 242 | AS |
| 272341 | India | 2015 | A0F4S | Coconut Water | Adults and Elderly | 15-49 years | Female | 7 | 242 | AS |
| 272342 | India | 2015 | A0F4S | Coconut Water | All | All | Female | 7 | 242 | AS |
272016 rows × 10 columns
continent['ContinentCode'].unique()
array(['AS', 'EU', 'NA', 'SA', 'AF'], dtype=object)
conti_names = { # stworzenie słownika dla kontynentów, żeby móc zamienić kody kontynentów na nazwy kontynentów
'AS' : 'Asia',
'EU' : 'Europe',
'NA' : 'North America',
'SA' : 'South America',
'AF' : 'Africa'
}
continent['Continent'] = continent['ContinentCode'].map(conti_names)
continent
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | ContinentCode | Continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | China | 2002 | A000G | Oat Grain | All | All | All | 1157 | 66172 | AS | Asia |
| 1 | China | 2002 | A000G | Oat Grain | All | All | Female | 608 | 33953 | AS | Asia |
| 2 | China | 2002 | A000G | Oat Grain | All | All | Male | 549 | 32219 | AS | Asia |
| 3 | China | 2002 | A000N | Buckwheat | All | All | All | 167 | 66172 | AS | Asia |
| 4 | China | 2002 | A000N | Buckwheat | All | All | Female | 82 | 33953 | AS | Asia |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 272338 | India | 2015 | A0EQN | Soft Drinks With Minor Amounts Of Fruits Or Fl... | All | All | Female | 8 | 242 | AS | Asia |
| 272339 | India | 2015 | A0EQN | Soft Drinks With Minor Amounts Of Fruits Or Fl... | Adults and Elderly | 15-49 years | All | 8 | 242 | AS | Asia |
| 272340 | India | 2015 | A0F4S | Coconut Water | All | All | All | 7 | 242 | AS | Asia |
| 272341 | India | 2015 | A0F4S | Coconut Water | Adults and Elderly | 15-49 years | Female | 7 | 242 | AS | Asia |
| 272342 | India | 2015 | A0F4S | Coconut Water | All | All | Female | 7 | 242 | AS | Asia |
272016 rows × 11 columns
continent['Continent'].unique()
array(['Asia', 'Europe', 'North America', 'South America', 'Africa'],
dtype=object)
food = continent
food_all = food.loc[food['Gender'] == "All"]
food_con = food_all.groupby(['ContinentCode','Number_of_subjects']).sum().reset_index()
food_con = food_con.groupby(['ContinentCode']).sum().reset_index()
food_con = food_con[['ContinentCode','Number_of_subjects']].sort_values(by='Number_of_subjects', ascending=False)
food_con = food_con.groupby(['ContinentCode']).sum().reset_index()
food_con.sum()
ContinentCode AFASEUNASA Number_of_subjects 518734 dtype: object
most_sub_ctry.sum()
Country BrazilChinaUnited States Of AmericaRepublic Of... Number_of_subjects 523051 dtype: object
food_ctry = food_all.groupby(['Country','Number_of_subjects']).sum().reset_index()
food_ctry = food_ctry.groupby(['Country']).sum().reset_index()
food_con = food_ctry.groupby(['ContinentCode','Number_of_subjects']).sum().reset_index()
food_con = food_con.groupby(['ContinentCode']).sum().reset_index()
food_con = food_con[['ContinentCode','Number_of_subjects']].sort_values(by='Number_of_subjects', ascending=False)
food_con.sum()
ContinentCode SASASASASASASASASASASASASASASASASASASASASASASA... Number_of_subjects 523051 dtype: object
food_con['ContinentCode'] = food_con['ContinentCode'].apply(lambda x: x[0:2])
food_con = food_con.groupby(['ContinentCode']).sum().reset_index().sort_values(by='Number_of_subjects', ascending=False)
food_con
| ContinentCode | Number_of_subjects | |
|---|---|---|
| 1 | AS | 187248 |
| 4 | SA | 144248 |
| 2 | EU | 108344 |
| 3 | NA | 73032 |
| 0 | AF | 10179 |
food_con['Continent'] = food_con['ContinentCode'].map(conti_names)
food_con
| ContinentCode | Number_of_subjects | Continent | |
|---|---|---|---|
| 1 | AS | 187248 | Asia |
| 4 | SA | 144248 | South America |
| 2 | EU | 108344 | Europe |
| 3 | NA | 73032 | North America |
| 0 | AF | 10179 | Africa |
fig = px.pie(food_con, values='Number_of_subjects', names='Continent', title='Percent of subjects per Continents',color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
it_oat_all = food_all.loc[food_all['FoodName'] == "Oat Grain"]
it_oat_all = it_oat_all.loc[it_oat_all['Country'] == "Italy"]
it_oat_all
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | ContinentCode | Continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 212154 | Italy | 2006 | A000G | Oat Grain | All | All | All | 4 | 3307 | EU | Europe |
| 212169 | Italy | 2006 | A000G | Oat Grain | Infants and Toddlers | 0-35 months | All | 0 | 36 | EU | Europe |
| 212170 | Italy | 2006 | A000G | Oat Grain | Children and Adolescents | 3-5 years | All | 0 | 67 | EU | Europe |
| 212171 | Italy | 2006 | A000G | Oat Grain | Children and Adolescents | 6-14 years | All | 0 | 284 | EU | Europe |
| 212172 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | 15-49 years | All | 2 | 1603 | EU | Europe |
| 212173 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | 50-74 years | All | 2 | 1089 | EU | Europe |
| 212174 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | >75 years | All | 0 | 228 | EU | Europe |
it_oat_all[1:].groupby(['Country','FoodName']).sum().reset_index() # sprawdzam czy suma wartości bez SourceAgeClass=All są równe All
| Country | FoodName | Year | FoodCode | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | ContinentCode | Continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Italy | Oat Grain | 12036 | A000GA000GA000GA000GA000GA000G | Infants and ToddlersChildren and AdolescentsCh... | 0-35 months3-5 years6-14 years15-49 years50-74... | AllAllAllAllAllAll | 4 | 3307 | EUEUEUEUEUEU | EuropeEuropeEuropeEuropeEuropeEurope |
# tworzę maskę która zawiera rzędy z All
mask = it_oat_all['AgeClass'] == 'All'
# używając ~ i maski pozbywam się z it_oat_all tego co zawiera maska
it_oat_all = it_oat_all[~mask]
it_oat_all
| Country | Year | FoodCode | FoodName | AgeClass | SourceAgeClass | Gender | Number_of_consumers | Number_of_subjects | ContinentCode | Continent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 212169 | Italy | 2006 | A000G | Oat Grain | Infants and Toddlers | 0-35 months | All | 0 | 36 | EU | Europe |
| 212170 | Italy | 2006 | A000G | Oat Grain | Children and Adolescents | 3-5 years | All | 0 | 67 | EU | Europe |
| 212171 | Italy | 2006 | A000G | Oat Grain | Children and Adolescents | 6-14 years | All | 0 | 284 | EU | Europe |
| 212172 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | 15-49 years | All | 2 | 1603 | EU | Europe |
| 212173 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | 50-74 years | All | 2 | 1089 | EU | Europe |
| 212174 | Italy | 2006 | A000G | Oat Grain | Adults and Elderly | >75 years | All | 0 | 228 | EU | Europe |
mask = food['AgeClass']== 'All'
food = food[~mask]
food_all = food.loc[food['Gender'] == "All"]
food_fem = food.loc[food['Gender'] == "Female"]
food_men = food.loc[food['Gender'] == "Male"]
most_consumed_all = food_all.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
most_consumed_fem = food_fem.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
most_consumed_men = food_men.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
most_consumed_all[['FoodName','Number_of_consumers']].head(10)
| FoodName | Number_of_consumers | |
|---|---|---|
| 1 | (All Cereals) | 142214 |
| 2811 | Wheat Bread And Rolls, White (Refined Flour) | 135436 |
| 1836 | Onions | 112344 |
| 2249 | Rice Grain, Polished | 105857 |
| 1997 | Pig Fresh Meat | 104118 |
| 2652 | Tap Water | 92605 |
| 2092 | Potatoes | 89154 |
| 375 | Carrots | 87843 |
| 1126 | Garlic | 84415 |
| 2243 | Rice Grain | 82719 |
show_most_consumed_all = most_consumed_all.head(30)
px.bar(show_most_consumed_all, x="FoodName", y="Number_of_consumers", title="Most popular foods in the World", labels={'Number_of_consumers':'Number of comsumers', 'FoodName': 'Food Names' })
food_all = food.loc[food['Gender'] == "All"]
food_all_eu = food_all.loc[food_all['Continent'] == "Europe"]
most_consumed_all_eu = food_all_eu.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
most_consumed_all_eu = most_consumed_all_eu.head(30)
px.bar(most_consumed_all_eu, x="FoodName", y="Number_of_consumers", title="Most popular food in Europe", labels={'Number_of_consumers':'Number of comsumers', 'FoodName': 'Food Names' })
show_most_consumed_men = most_consumed_men.head(15)
show_most_consumed_fem = most_consumed_fem.head(15)
first_line = go.Bar(x=show_most_consumed_men["FoodName"], y=show_most_consumed_men["Number_of_consumers"], name="Male")
second_line = go.Bar(x=show_most_consumed_fem["FoodName"], y=show_most_consumed_fem["Number_of_consumers"], name="Female")
fig = make_subplots(rows=1, cols=2)
fig.add_trace(first_line,row=1, col=1)
fig.add_trace(second_line,row=1, col=2)
fig.update_layout(title_text="Most popular foods for men and women")
fig.show()
show_most_consumed_fem[:1]['Number_of_consumers']/food_fem['Number_of_subjects'].unique().sum()*100
1 49.22 Name: Number_of_consumers, dtype: float64
show_most_consumed_men[:1]['Number_of_consumers']/food_men['Number_of_subjects'].unique().sum()*100
2694 48.40 Name: Number_of_consumers, dtype: float64
food_fem_eu = food_fem.loc[food_fem['Continent'] == "Europe"]
food_men_eu = food_men.loc[food_men['Continent'] == "Europe"]
most_consumed_fem_eu = food_fem_eu.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
most_consumed_men_eu = food_men_eu.groupby(['FoodName']).sum().reset_index().sort_values(by='Number_of_consumers', ascending=False)
show_most_consumed_men_eu = most_consumed_men_eu.head(15)
show_most_consumed_fem_eu = most_consumed_fem_eu.head(15)
first_line = go.Bar(x=show_most_consumed_men_eu["FoodName"], y=show_most_consumed_men_eu["Number_of_consumers"], name="Male")
second_line = go.Bar(x=show_most_consumed_fem_eu["FoodName"], y=show_most_consumed_fem_eu["Number_of_consumers"], name="Female")
fig = make_subplots(rows=1, cols=2)
fig.add_trace(first_line,row=1, col=1)
fig.add_trace(second_line,row=1, col=2)
fig.update_layout(title_text="Most popular foods for men and women in EU")
fig.show()